import pandas as pds
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns #for plotting
from sklearn.ensemble import RandomForestClassifier #for the model
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz #plot tree
from sklearn.metrics import roc_curve, auc #for model evaluation
from sklearn.metrics import classification_report #for model evaluation
from sklearn.metrics import confusion_matrix #for model evaluation
from sklearn.model_selection import train_test_split #for data splitting
import eli5 #for permutation importance
from eli5.sklearn import PermutationImportance
import shap #for SHAP values
from pdpbox import pdp, info_plots #for partial plots
import warnings
warnings.filterwarnings('ignore')
print("> Reading from data file and putting headers")
# Column names for the processed Cleveland heart-disease data set
# (the CSV itself has no header row).
COLUMN_NAMES = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']
dataframe = pds.read_csv("E:\\New_Big _data\\Program\\processed.cleveland.csv",
                         header=None,
                         names=COLUMN_NAMES)
print("dataframe >>")
print(dataframe.head())
print("> Replace empty values with NaN and meaningful value")
# BUG FIX: the original replaced "?" with the *string* "NaN"; use a real
# missing value (np.nan) so the imputer recognises it directly.
dataframe_re = dataframe.replace(to_replace="?", value=np.nan)
print("dataframe_re >>")
print(dataframe_re.head())
# BUG FIX: sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# SimpleImputer (column-wise by design, so no axis argument) is the
# supported replacement. Median imputation fills the missing 'ca'/'thal'
# entries.
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy="median")
imp_df = imp.fit_transform(dataframe_re)
print("imp_df >>")
print(type(imp_df))  # a plain numpy array; rebuild the DataFrame below
new_df = pds.DataFrame(imp_df, columns=COLUMN_NAMES)
print("new_df >>")
print(new_df.head())
print("> Outliner detection for each column")
def detect_outlier(data_1, threshold=3):
    """Return the 1-based row positions of z-score outliers in *data_1*.

    Parameters
    ----------
    data_1 : array-like of numbers
        Column values to scan.
    threshold : float, optional
        Absolute z-score above which a value counts as an outlier
        (default 3, the original hard-coded cutoff).

    Returns
    -------
    list of int
        1-based row positions (the original counted from 1, and callers
        print these positions, so that numbering is preserved).

    Notes
    -----
    A constant column (std == 0) produces nan/inf z-scores, as in the
    original implementation.
    """
    mean_1 = np.mean(data_1)
    std_1 = np.std(data_1)
    # BUG FIX: the original also collected the outlier *values* into a list
    # that was never returned or used (dead code), and misspelled the row
    # list as `ourliers_row`; both cleaned up. enumerate(..., start=1)
    # replaces the manual counter.
    outlier_rows = []
    for row, value in enumerate(data_1, start=1):
        z_score = (value - mean_1) / std_1
        if np.abs(z_score) > threshold:
            outlier_rows.append(row)
    return outlier_rows
# Scan every column of the imputed frame, echo the column name, and
# accumulate the 1-based row positions flagged by detect_outlier.
out_list = []
for column_name in new_df.columns:
    print(column_name)
    column_values = new_df[column_name].values
    out_list.extend(detect_outlier(column_values))
print(out_list)
print("> Check and use common datatype")
print(new_df.dtypes)
# Cast every column to a common integer dtype (the imputer returned floats).
# BUG FIX: the original assigned `oldpeak` from the *slope* column
# (new_df['oldpeak'] = new_df['slope'].astype(...)), silently destroying
# the ST-depression values; each column is now cast from itself. The
# duplicated 'cp' cast is also gone.
# NOTE(review): casting `oldpeak` to int64 truncates its fractional part
# (e.g. 2.3 -> 2), matching the original's all-int intent — confirm this
# loss of precision is acceptable for the model.
for column in new_df.columns:
    new_df[column] = new_df[column].astype('int64')
print(new_df.dtypes)
print("Draw correlation matrix for feature selection")
# Visualise pairwise feature correlations on the imputed frame.
corrmat = new_df.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20, 20))
heatmapResult = sns.heatmap(new_df[top_corr_features].corr(),
                            annot=True,
                            cmap="RdYlGn")
plt.savefig('heatmapResult.png')
plt.show()
print("Need to drop thalach due to feature engineering")
df = new_df.drop(['thalach'], axis=1)
df.head(10)
print("New correlation matrix for feature selection")
# Redraw the heat-map after removing thalach to confirm the remaining
# correlations.
corrmat = df.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20, 20))
heatmapResultNew = sns.heatmap(df[top_corr_features].corr(),
                               annot=True,
                               cmap="RdYlGn")
plt.savefig('heatmapResultNew.png')
plt.show()
print("Convert and categorise num into 0 or 1 and remove num")
# Collapse the 0-4 disease severity in `num` into a binary target:
# 0 = no disease, 1-4 = disease present.
heartdisease_map = {0: 0, 1: 1, 2: 1, 3: 1, 4: 1}
df['heartdisease'] = df['num'].map(heartdisease_map)
df = df.drop(['num'], axis=1)
df.head(10)
print("Splitting the data in training and testing")
# BUG FIX: DataFrame.drop no longer accepts the axis positionally
# (deprecated in pandas 1.x, removed in 2.0); use the keyword form.
x_data = df.drop('heartdisease', axis=1)
y_data = df['heartdisease']
# 80/20 train/test split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data,
                                                    test_size=0.2,
                                                    random_state=10)
print("Training Random Forest Classification")
# NOTE: RandomForestClassifier is already imported at the top of the file;
# the redundant local re-import has been removed.
rf = RandomForestClassifier(n_estimators=1000, random_state=1, max_depth=9)
rf.fit(x_train, y_train)
accuracy = rf.score(x_test, y_test) * 100
print("Random Forest Algorithm Accuracy Score : {:.2f}%".format(accuracy))
# Pick one tree out of the forest for visualisation below.
estimator = rf.estimators_[1]
feature_names = list(x_train.columns)
# Human-readable class labels for the tree plot.
y_train_str = y_train.astype('str')
y_train_str[y_train_str == '0'] = 'no disease'
y_train_str[y_train_str == '1'] = 'disease'
y_train_str = y_train_str.values
# Adapted from: https://towardsdatascience.com/how-to-visualize-a-decision-tree-from-a-random-forest-in-python-using-scikit-learn-38ad2d75f21c
# Dump the chosen tree to Graphviz source, render it to PNG with the
# external `dot` tool, then load the image for inline display.
export_graphviz(estimator,
                out_file='tree.dot',
                feature_names=feature_names,
                class_names=y_train_str,
                label='root',
                rounded=True,
                proportion=True,
                precision=2,
                filled=True)
from subprocess import call
render_cmd = ['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600']
call(render_cmd)
from IPython.display import Image
Image(filename='tree.png')
print("Plotting the confusion matrix")
y_head_rf = rf.predict(x_test)
cm_rf = confusion_matrix(y_test, y_head_rf)
# Positive-class probabilities, used later for the ROC curve.
y_pred_quant = rf.predict_proba(x_test)[:, 1]
print(cm_rf)
plt.figure(figsize=(20, 10))
plt.suptitle("Confusion Matrixes", fontsize=5)
plt.subplots_adjust(wspace=0.8, hspace=0.8)
plt.subplot(2, 3, 1)
plt.title("Random Forest Confusion Matrix")
sns.heatmap(cm_rf, annot=True, cmap="Blues", fmt="d", cbar=False,
            annot_kws={"size": 15})
plt.savefig('confusionMatrix.png')
plt.show()
print("Calculating the sensitivity and Specificity")
# sklearn's confusion matrix layout is [[TN, FP], [FN, TP]]
# (rows = true labels, columns = predicted labels).
tn, fp = cm_rf[0, 0], cm_rf[0, 1]
fn, tp = cm_rf[1, 0], cm_rf[1, 1]
# BUG FIX: the original computed sensitivity as TN/(TN+FN) (that is NPV)
# and specificity as TP/(TP+FP) (that is precision). Correct formulas:
# sensitivity = TP/(TP+FN), specificity = TN/(TN+FP). The unused
# `total` accumulator was also removed.
sensitivity = tp / (tp + fn)
print('Sensitivity : ', sensitivity)
specificity = tn / (tn + fp)
print('Specificity : ', specificity)
# ROC curve from the held-out positive-class probabilities.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_quant)
fig, ax = plt.subplots()
ax.plot(fpr, tpr)
# Diagonal reference line for a random classifier.
ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c=".3")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.rcParams['font.size'] = 12
# BUG FIX: the title said "diabetes classifier", but this model predicts
# heart disease.
plt.title('ROC curve for heart disease classifier')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.grid(True)
plt.savefig('rocCurve.png')  # saved, consistent with the other figures
plt.show()  # BUG FIX: the ROC figure was never displayed in script mode
# BUG FIX: the AUC was computed but its value was discarded; report it.
print('AUC : ', auc(fpr, tpr))
# Permutation importance of each feature on the test set.
perm = PermutationImportance(rf, random_state=1).fit(x_test, y_test)
# NOTE(review): eli5.show_weights returns an IPython display object; in a
# plain script this result is discarded — confirm whether it should be
# rendered (e.g. via IPython.display) when run outside a notebook.
eli5.show_weights(perm, feature_names=x_test.columns.tolist())
# Features still present after dropping thalach and num.
base_features = df.columns.values.tolist()
base_features.remove('heartdisease')
# Partial-dependence plot for each feature of interest.
# (The original repeated this stanza four times verbatim; a loop removes
# the duplication without changing which plots are produced or their order.)
for feat_name in ('ca', 'age', 'oldpeak', 'trestbps'):
    pdp_dist = pdp.pdp_isolate(model=rf, dataset=x_test,
                               model_features=base_features,
                               feature=feat_name)
    pdp.pdp_plot(pdp_dist, feat_name)
    plt.show()